Name: Daniel Niazov
ID: 207437997
Kaggle: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
!pip install --upgrade plotly
!pip install sweetviz
In this notebook I'm tried to predict house prices by using linear regression on different data divisions and feature selection and also by regulation.
# import numpy, matplotlib, etc.
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# sklearn imports
from sklearn import metrics
from sklearn import pipeline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeavePOut
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# load "train" data set
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
test_ids = test['Id']
train.head(10)
train.info()
train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id'], axis=1, inplace=True)
test.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id'], axis=1, inplace=True)
# import sweetviz and show report on train
import sweetviz as sw
train_report = sw.analyze(train)
train_report.show_notebook(layout='vertical')
corr = train.corr()
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12
plt.figure(figsize=(15, 15))
sns.heatmap(corr, linewidths=.005)
plt.show()
#saleprice correlation matrix
k = 34 #number of variables for heatmap
cols = corr.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
plt.figure(figsize=(20, 20))
hm = sns.heatmap(cm, annot=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
cols
# count empty values in each column
def count_empty_values_in_each_column(df):
numerical_cols = []
for col in df.columns:
value_missing = np.mean(df[col].isnull())
if value_missing > 0:
if df[col].dtype == 'float64':
numerical_cols.append(col)
print('{} - {:.2f}% ;{}'.format(col, value_missing*100,df[col].dtype))
return numerical_cols
count_empty_values_in_each_column(train)
# fill empty values by median value for each column in the dataframe
def fill_na_median(df, column_names):
for column_name in column_names:
df_not_null = df[~df[column_name].isnull()]
df[column_name].fillna(df_not_null[column_name].median(), inplace=True)
# fill empty values by mean value for each column in the dataframe
def fill_na_mean(df, column_names):
for column_name in column_names:
df_not_null = df[~df[column_name].isnull()]
df[column_name].fillna(df_not_null[column_name].mean(), inplace=True)
# fill empty values by mode value for each column in the dataframe
def fill_na_mode(df, column_names):
for column_name in column_names:
df[column_name].fillna(df[column_name].mode()[0], inplace=True)
# import make_subplots and create pie charts subplots of the categorical features
from plotly.subplots import make_subplots
def create_pie_chart_subplot_of_count(df, columns_names):
rows = int(np.ceil(np.sqrt(len(columns_names))))
cols = int(np.ceil(len(columns_names)/rows))
fig = make_subplots(rows=rows, cols=cols, specs=[[{"type": "domain"} for i in range(cols)] for j in range(rows)])
for i, column_name in enumerate(columns_names):
df_not_null = df[~df[column_name].isnull()]
fig.add_trace(go.Pie(labels=df_not_null.groupby([column_name]).size().reset_index(name='count')[column_name],
values=df_not_null.groupby([column_name]).size().reset_index(name='count')['count'],
name=column_name),
(i)//cols+1, (i)%cols+1)
fig.update_layout(margin=dict(t=10, l=10, r=10, b=10))
fig.show()
count_empty_values_in_each_column(train)
train['LotFrontage'] = train.groupby(['Neighborhood', 'Street'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
train.MasVnrArea.fillna(train.MasVnrArea.median(), inplace=True)
train.GarageYrBlt.fillna(train.MasVnrArea.median(), inplace = True)
train.BsmtFinType1.fillna("NB", inplace=True)
train.BsmtFinType2.fillna("NB", inplace=True)
train.BsmtExposure.fillna("NB", inplace=True)
train.BsmtQual.fillna("NB", inplace=True)
train.BsmtCond.fillna("No", inplace=True)
train.GarageCond.fillna("No", inplace = True)
train.GarageQual.fillna("No", inplace = True)
train.GarageFinish.fillna("No", inplace = True)
train.GarageType.fillna("No", inplace = True)
train.FireplaceQu.fillna("No", inplace = True)
train.Electrical.fillna("SBrkr", inplace = True)
train.MasVnrType.fillna("NMV", inplace = True)
count_empty_values_in_each_column(train)
# divide the data to features and target
t = train['SalePrice'].copy()
X = train.drop(['SalePrice'], axis=1)
print('t')
display(t)
print()
print('X')
display(X)
# find best subset of features on this dataset
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns
all_cols = list(categorical_cols) + list(numerical_cols)
ct_enc_std = ColumnTransformer([
("encoding", OrdinalEncoder(), categorical_cols),
("standard", StandardScaler(), numerical_cols)])
X_encoded = pd.DataFrame(ct_enc_std.fit_transform(X, t), columns=all_cols)
selector = RFECV(SGDRegressor(random_state=1), cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)).fit(X_encoded, t)
display(X_encoded.loc[:, selector.support_])
fig = go.Figure()
results = selector.cv_results_['mean_test_score'] # Getting the mean cv score for each set of features
fig.add_trace(go.Scatter(x=[i for i in range(1, len(results) + 1)], y=results))
fig.update_xaxes(title_text="Number of features selected")
fig.update_yaxes(title_text="Cross validation score (nb of correct classifications)")
fig.show()
cols = cols.drop(["SalePrice"])
# X_encoded_selected_features = X_encoded.loc[:, selector.support_]
X_encoded_selected_features = X_encoded.loc[:, cols]
X_encoded_selected_features
train.head(10)
hyper_parameters = {'penalty': ('l2', 'l1', 'elasticnet'), 'alpha':[0.0001, 0.001, 0.01, 0.1], 'learning_rate':['constant'], 'eta0':[0.0001, 0.001, 0.01, 0.1]}
gs_model1 = GridSearchCV(SGDRegressor(random_state=73), hyper_parameters).fit(X_encoded, np.log(t))
print('best parameters model 1: ', gs_model1.best_params_)
gs_model2 = GridSearchCV(SGDRegressor(random_state=73), hyper_parameters).fit(X_encoded_selected_features, np.log(t))
print('best parameters model 2: ', gs_model2.best_params_)
model1 = SGDRegressor(alpha=gs_model1.best_params_['alpha'], learning_rate='constant', eta0=gs_model1.best_params_['eta0'], penalty=gs_model1.best_params_['penalty'])
model2 = SGDRegressor(alpha=gs_model2.best_params_['alpha'], learning_rate='constant', eta0=gs_model2.best_params_['eta0'], penalty=gs_model2.best_params_['penalty'])
def find_generator_len(generator, use_pbar=True):
i = 0
if use_pbar:
pbar = tqdm(desc='Calculating Length', ncols=1000, bar_format='{desc}{bar:10}{r_bar}')
for a in generator:
i += 1
if use_pbar:
pbar.update()
if use_pbar:
pbar.close()
return i
# calculate score and loss from cv (KFold or LPO) and display graphs
def CVscoreLoss(x, t, model, k=None, p=None, show_score_loss_graphs=False, use_pbar=True):
scoresLosses_df = pd.DataFrame(columns=['fold_id', 'split', 'score', 'loss'])
if k is not None:
cv = KFold(n_splits=k, shuffle=True, random_state=73)
elif p is not None:
cv = LeavePOut(p)
else:
raise ValueError("Missing k or p to calculate Cross Validation")
if use_pbar:
pbar = tqdm(desc='Calculating Models', total=find_generator_len(cv.split(x)))
for i, (train_ids, val_ids) in enumerate(cv.split(x)):
x_train = x.loc[train_ids] # Create train x and t
t_train = t.loc[train_ids]
x_val = x.loc[val_ids] # Create valid x and t
t_val = t.loc[val_ids]
model.fit(x_train, t_train) #train the model
y_train = model.predict(x_train) # Getting predictions
y_val = model.predict(x_val)
y_train = np.exp(y_train)
y_val = np.exp(y_val)
scoresLosses_df.loc[len(scoresLosses_df)] = [i, 'train', model.score(x_train, t_train), mean_squared_error(t_train, np.log(y_train), squared = False)]
scoresLosses_df.loc[len(scoresLosses_df)] = [i, 'val', model.score(x_val, t_val), mean_squared_error(t_val, np.log(y_val), squared = False)]
if use_pbar:
pbar.update()
if use_pbar:
pbar.close()
val_scores_losses_df = scoresLosses_df[scoresLosses_df['split']=='val']
train_scores_losses_df = scoresLosses_df[scoresLosses_df['split']=='train']
mean_val_score = val_scores_losses_df['score'].mean()
mean_val_loss = val_scores_losses_df['loss'].mean()
mean_train_score = train_scores_losses_df['score'].mean()
mean_train_loss = train_scores_losses_df['loss'].mean()
if show_score_loss_graphs:
fig = px.line(scoresLosses_df, x='fold_id', y='score', color='split', title=f'Mean Val Score: {mean_val_score:.2f}, Mean Train Score: {mean_train_score:.2f}')
fig.show()
fig = px.line(scoresLosses_df, x='fold_id', y='loss', color='split', title=f'Mean Val Loss: {mean_val_loss:.2f}, Mean Train Loss: {mean_train_loss:.2f}')
fig.show()
print("Model 1, k = 10 Figures: ")
CVscoreLoss(X_encoded_selected_features, np.log(t), model1, k=10, show_score_loss_graphs=True, use_pbar=True )
print("Model 2, k = 10 Figures: ")
CVscoreLoss(X_encoded_selected_features, np.log(t), model2, k=10, show_score_loss_graphs=True, use_pbar=True )
print("Model 1, k = 5 Figures: ")
CVscoreLoss(X_encoded_selected_features, np.log(t), model1, k=5, show_score_loss_graphs=True, use_pbar=True )
print("Model 2, k = 5 Figures: ")
CVscoreLoss(X_encoded_selected_features, np.log(t), model2, k=5, show_score_loss_graphs=True, use_pbar=True )
test = test[list(X_encoded_selected_features.columns)] # Drop what we don't care about. Like me feelings
numerical_cols_test = test.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test = test.select_dtypes(include=['object']).columns
ct_enc_std_test = ColumnTransformer([
("encoding", OrdinalEncoder(), categorical_cols_test),
("standard", StandardScaler(), numerical_cols_test)])
x_encoded_test = pd.DataFrame(ct_enc_std_test.fit_transform(test), columns=X_encoded_selected_features.columns) #activate the encoder and scaler for all cols
x_encoded_test
missin_col = count_empty_values_in_each_column(x_encoded_test)
test_pred_model = model2.predict(x_encoded_test)
test_pred_model_exp = np.exp(test_pred_model)
result = pd.DataFrame()
result['Id'] = test_ids
result['SalePrice'] = test_pred_model_exp
result.to_csv('submission.csv', index = False)
print(result)
I show a data investigation on the data which included checking if all the examples in the columns have values and if not, I used sweetviz and autoviz to display as much data on the data including graphs and correlations between features. After testing whether the correlation to the target feature is less than 0.1 I decided to drop those features from the data. I performed different experiments using feature selection so that in the end I was able to choose the best model on which to perform the prediction for the test.